For what purpose was the dataset created?
Who funded the creation of the dataset?
What do the instances in this dataset represent?
Are there recommended data splits?
Does the dataset contain data that might be considered sensitive in any way?
Was there any data preprocessing performed?
Additional Information
# Pull the CDC Diabetes Health Indicators dataset from the UCI ML repository.
from ucimlrepo import fetch_ucirepo

dataset = fetch_ucirepo(id=891)

# Features and binary target come back as pandas DataFrames.
X = dataset.data.features
y = dataset.data.targets

# How many feature columns we have.
print(len(X.columns))
21
print(y.value_counts())
Diabetes_binary 0 218334 1 35346 Name: count, dtype: int64
X.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 253680 entries, 0 to 253679 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 HighBP 253680 non-null int64 1 HighChol 253680 non-null int64 2 CholCheck 253680 non-null int64 3 BMI 253680 non-null int64 4 Smoker 253680 non-null int64 5 Stroke 253680 non-null int64 6 HeartDiseaseorAttack 253680 non-null int64 7 PhysActivity 253680 non-null int64 8 Fruits 253680 non-null int64 9 Veggies 253680 non-null int64 10 HvyAlcoholConsump 253680 non-null int64 11 AnyHealthcare 253680 non-null int64 12 NoDocbcCost 253680 non-null int64 13 GenHlth 253680 non-null int64 14 MentHlth 253680 non-null int64 15 PhysHlth 253680 non-null int64 16 DiffWalk 253680 non-null int64 17 Sex 253680 non-null int64 18 Age 253680 non-null int64 19 Education 253680 non-null int64 20 Income 253680 non-null int64 dtypes: int64(21) memory usage: 40.6 MB
X.describe()
| HighBP | HighChol | CholCheck | BMI | Smoker | Stroke | HeartDiseaseorAttack | PhysActivity | Fruits | Veggies | ... | AnyHealthcare | NoDocbcCost | GenHlth | MentHlth | PhysHlth | DiffWalk | Sex | Age | Education | Income | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | ... | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 | 253680.000000 |
| mean | 0.429001 | 0.424121 | 0.962670 | 28.382364 | 0.443169 | 0.040571 | 0.094186 | 0.756544 | 0.634256 | 0.811420 | ... | 0.951053 | 0.084177 | 2.511392 | 3.184772 | 4.242081 | 0.168224 | 0.440342 | 8.032119 | 5.050434 | 6.053875 |
| std | 0.494934 | 0.494210 | 0.189571 | 6.608694 | 0.496761 | 0.197294 | 0.292087 | 0.429169 | 0.481639 | 0.391175 | ... | 0.215759 | 0.277654 | 1.068477 | 7.412847 | 8.717951 | 0.374066 | 0.496429 | 3.054220 | 0.985774 | 2.071148 |
| min | 0.000000 | 0.000000 | 0.000000 | 12.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 |
| 25% | 0.000000 | 0.000000 | 1.000000 | 24.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | ... | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 4.000000 | 5.000000 |
| 50% | 0.000000 | 0.000000 | 1.000000 | 27.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 5.000000 | 7.000000 |
| 75% | 1.000000 | 1.000000 | 1.000000 | 31.000000 | 1.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 0.000000 | 3.000000 | 2.000000 | 3.000000 | 0.000000 | 1.000000 | 10.000000 | 6.000000 | 8.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 98.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 5.000000 | 30.000000 | 30.000000 | 1.000000 | 1.000000 | 13.000000 | 6.000000 | 8.000000 |
8 rows × 21 columns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% for testing. random_state makes the split reproducible, and
# stratify=y keeps the same class ratio in both splits — important here because
# the target is heavily imbalanced (~14% positives); the original did neither.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Fit the scaler on the training split only, then apply the same transform to
# the test split so no test-set statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
from imblearn.combine import SMOTEENN

# Rebalance the TRAINING split only: SMOTE oversamples the minority class,
# then ENN removes noisy/borderline samples. A fixed random_state makes the
# resampling reproducible (the original result changed on every run).
cnt_before = y_train.value_counts()
X_train_os, y_train_os = SMOTEENN(random_state=42).fit_resample(X_train, y_train)
cnt_after = y_train_os.value_counts()
print(cnt_before, cnt_after)
Diabetes_binary 0 152764 1 24812 Name: count, dtype: int64 Diabetes_binary 1 136026 0 101618 Name: count, dtype: int64
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Sweep k = 1..19 and record the test-set misclassification rate for each.
k_values = range(1, 20)
error_rate = []
for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train_os, y_train_os.values.ravel())
    test_predictions = model.predict(X_test)
    error_rate.append(np.mean(test_predictions != y_test.values.ravel()))

# Elbow plot: error rate vs. number of neighbors.
plt.figure(figsize=(8, 5))
plt.plot(k_values, error_rate, marker='o', markersize=3)
[<matplotlib.lines.Line2D at 0x176a8ab82b0>]
I will select 4 as n_neighbors.
# Final KNN model with the elbow-selected k = 4; report hold-out accuracy.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train_os, y_train_os.values.ravel())
y_pred_knn = knn.predict(X_test)
accuracy_score(y_test, y_pred_knn)
0.6969804478082624
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc

# Candidate values 1..11 for min_samples_leaf.
min_samples_leafs = np.linspace(1, 11, 11, dtype=int, endpoint=True)

# Repeat the sweep 5 times and AVERAGE the AUCs across repeats. The original
# code reset train_results/test_results at the top of every repeat, so four of
# the five runs were silently discarded and only the last (noisy) run was kept.
n_repeats = 5
train_results = np.zeros(len(min_samples_leafs))
test_results = np.zeros(len(min_samples_leafs))
for _ in range(n_repeats):
    for idx, min_samples_leaf in enumerate(min_samples_leafs):
        dt = DecisionTreeClassifier(min_samples_leaf=min_samples_leaf)
        dt.fit(X_train_os, y_train_os)
        # Train AUC from hard predictions on the (resampled) training set.
        train_pred = dt.predict(X_train_os)
        fpr, tpr, _ = roc_curve(y_train_os, train_pred)
        train_results[idx] += auc(fpr, tpr)
        # Test AUC on the untouched test split.
        y_pred_dt = dt.predict(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_pred_dt)
        test_results[idx] += auc(fpr, tpr)
train_results /= n_repeats
test_results /= n_repeats
from matplotlib.legend_handler import HandlerLine2D

# Train vs. test AUC across the min_samples_leaf sweep.
plt.figure(figsize=(5, 3))
train_line, = plt.plot(min_samples_leafs, train_results, 'b', label='Train AUC')
test_line, = plt.plot(min_samples_leafs, test_results, 'r', label='Test AUC')
plt.legend(handler_map={train_line: HandlerLine2D(numpoints=2)})
plt.ylabel('AUC score')
plt.xlabel('min_samples_leaf')
plt.show()
I select 6 as min_samples_leaf, because beyond 6 the AUC score shows only a constant, gradual increase, and I want to avoid overfitting.
# Final decision tree with the selected min_samples_leaf = 6; hold-out accuracy.
dt = DecisionTreeClassifier(min_samples_leaf=6).fit(X_train_os, y_train_os)
y_pred_dt = dt.predict(X_test)
accuracy_score(y_test, y_pred_dt)
0.7844265741616735
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

# Candidate values 1..10 for max_features (21/3 = 7 is the classic heuristic).
max_features_n = np.linspace(1, 10, 10, dtype=int, endpoint=True)

# Repeat 5 times and AVERAGE. As in the decision-tree sweep, the original reset
# its result lists on every repeat, so only the final run survived.
n_repeats = 5
test_results = np.zeros(len(max_features_n))
error_rate = np.zeros(len(max_features_n))
for _ in range(n_repeats):
    for idx, max_features in enumerate(max_features_n):
        rf = RandomForestClassifier(max_features=max_features)
        rf.fit(X_train_os, y_train_os.values.ravel())
        y_pred_rf = rf.predict(X_test)
        # Test AUC from hard predictions.
        fpr, tpr, _ = roc_curve(y_test, y_pred_rf)
        test_results[idx] += auc(fpr, tpr)
        # RMSE of 0/1 predictions vs. 0/1 labels (= sqrt of error rate).
        error_rate[idx] += np.sqrt(mean_squared_error(y_test, y_pred_rf))
test_results /= n_repeats
error_rate /= n_repeats
# Twin-axis plot: test AUC (left axis, blue) and test RMSE (right axis, red)
# against max_features.
fig, ax_auc = plt.subplots(figsize=(5, 3))
ax_rmse = ax_auc.twinx()
ax_auc.plot(max_features_n, test_results, 'b', label='Test AUC')
ax_rmse.plot(max_features_n, error_rate, 'r', label='Test RMSE')
ax_auc.set_xlabel('max_features')
ax_auc.set_ylabel('AUC score', color='b')
ax_rmse.set_ylabel('RMSE', color='r')
plt.show()
I go for 6 to get predictions.
# Final random forest with the selected max_features = 6; hold-out accuracy.
rf = RandomForestClassifier(max_features=6).fit(X_train_os, y_train_os.values.ravel())
y_pred_rf = rf.predict(X_test)
accuracy_score(y_test, y_pred_rf)
0.7914170083044255
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

# Grid search over n_estimators in {20..110} x max_depth in {4..13}; plot the
# train/test log-loss curves in a 10x10 grid of subplots.
fig, axs = plt.subplots(10, 10, figsize=(50, 50))
evalset = [(X_train_os, y_train_os), (X_test, y_test)]
for i in range(10):
    for j in range(10):
        n_estimators = 20 + 10 * i
        max_depth = 4 + j
        # eval_metric must be set BEFORE fit: the original called
        # set_params(eval_metric='logloss') after fit, which cannot affect the
        # already-recorded evaluation history.
        model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                              eval_metric='logloss')
        model.fit(X_train_os, y_train_os, eval_set=evalset, verbose=False)
        score = accuracy_score(y_test, model.predict(X_test))
        results = model.evals_result()
        axs[i, j].plot(results['validation_0']['logloss'], label='train')
        axs[i, j].plot(results['validation_1']['logloss'], label='test')
        axs[i, j].set_title(str(n_estimators) + ', ' + str(max_depth))
I would like to select 4 as max_depth, initially.
# Refine: fix max_depth = 4 and sweep n_estimators over {20..110}.
fig, axs = plt.subplots(1, 10, figsize=(50, 5))
# Define the eval set explicitly — the original commented this line out and
# silently relied on a stale `evalset` from an earlier cell.
evalset = [(X_train_os, y_train_os), (X_test, y_test)]
for i in range(10):
    n_estimators = 20 + 10 * i
    # eval_metric goes in the constructor so the eval history records log-loss.
    model = XGBClassifier(n_estimators=n_estimators, max_depth=4,
                          eval_metric='logloss')
    model.fit(X_train_os, y_train_os, eval_set=evalset, verbose=False)
    score = accuracy_score(y_test, model.predict(X_test))
    results = model.evals_result()
    axs[i].plot(results['validation_0']['logloss'], label='train')
    axs[i].plot(results['validation_1']['logloss'], label='test')
    axs[i].set_title(str(n_estimators) + ', ' + str(4))
It seems we may need more n_estimators to see where the loss curve levels off.
# Extend the sweep: max_depth = 4, n_estimators over {100..190}.
fig, axs = plt.subplots(1, 10, figsize=(50, 5))
# Define the eval set explicitly — the original commented this line out and
# silently relied on a stale `evalset` from an earlier cell.
evalset = [(X_train_os, y_train_os), (X_test, y_test)]
for i in range(10):
    n_estimators = 100 + 10 * i
    # eval_metric goes in the constructor so the eval history records log-loss.
    model = XGBClassifier(n_estimators=n_estimators, max_depth=4,
                          eval_metric='logloss')
    model.fit(X_train_os, y_train_os, eval_set=evalset, verbose=False)
    score = accuracy_score(y_test, model.predict(X_test))
    results = model.evals_result()
    axs[i].plot(results['validation_0']['logloss'], label='train')
    axs[i].plot(results['validation_1']['logloss'], label='test')
    axs[i].set_title(str(n_estimators) + ', ' + str(4))
The loss may still be decreasing, so I will go back to my first grid plot and shift the intervals; I may also revise my initial choice of max_depth.
# Final grid: n_estimators in {180..270} x max_depth in {4..13}.
fig, axs = plt.subplots(10, 10, figsize=(50, 50))
evalset = [(X_train_os, y_train_os), (X_test, y_test)]
for i in range(10):
    for j in range(10):
        n_estimators = 180 + 10 * i
        max_depth = 4 + j
        # eval_metric must be set BEFORE fit; the original's post-fit
        # set_params(eval_metric=...) had no effect on the recorded history.
        model = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                              eval_metric='logloss')
        model.fit(X_train_os, y_train_os, eval_set=evalset, verbose=False)
        score = accuracy_score(y_test, model.predict(X_test))
        results = model.evals_result()
        axs[i, j].plot(results['validation_0']['logloss'], label='train')
        axs[i, j].plot(results['validation_1']['logloss'], label='test')
        axs[i, j].set_title(str(n_estimators) + ', ' + str(max_depth))
Ultimately, I select the pair of 270 and 5.
# Final XGBoost model with the selected n_estimators = 270, max_depth = 5.
xgb = XGBClassifier(n_estimators=270, max_depth=5).fit(
    X_train_os, y_train_os.values.ravel()
)
y_pred_xgb = xgb.predict(X_test)
accuracy_score(y_test, y_pred_xgb)
0.8120729528014297
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import absolute
from numpy import sqrt

# 10-fold CV comparison of the four tuned classifiers.
# NOTE(review): this runs on the raw, unscaled, unresampled X/y, whereas the
# models were tuned on standardized + SMOTEENN-resampled data — so these scores
# are not directly comparable to the hold-out accuracies; wrapping the scaler
# and model in a Pipeline would make CV consistent with training.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
for clf in [knn, dt, rf, xgb]:
    # ravel() the single-column target DataFrame to a 1-D array; passing the
    # DataFrame triggers sklearn's DataConversionWarning.
    scores = cross_val_score(clf, X, y.values.ravel(),
                             scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    # For a 0/1 target, mean absolute error equals the misclassification rate.
    print(clf, ' : ', mean(absolute(scores)))
KNeighborsClassifier(n_neighbors=4) : 0.1454864396089562
DecisionTreeClassifier(min_samples_leaf=6) : 0.15899164301482183
RandomForestClassifier(max_features=6) : 0.14130794701986754
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=5, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=270, n_jobs=None,
num_parallel_tree=None, random_state=None, ...) : 0.13468543046357617
# Hold-out accuracy for each fitted model, reusing the predictions computed above.
model_predictions = [(knn, y_pred_knn), (dt, y_pred_dt), (rf, y_pred_rf), (xgb, y_pred_xgb)]
for model, preds in model_predictions:
    print(model, ' : ', accuracy_score(y_test, preds))
KNeighborsClassifier(n_neighbors=4) : 0.6969804478082624
DecisionTreeClassifier(min_samples_leaf=6) : 0.7844265741616735
RandomForestClassifier(max_features=6) : 0.7914170083044255
XGBClassifier(base_score=None, booster=None, callbacks=None,
colsample_bylevel=None, colsample_bynode=None,
colsample_bytree=None, device=None, early_stopping_rounds=None,
enable_categorical=False, eval_metric=None, feature_types=None,
gamma=None, grow_policy=None, importance_type=None,
interaction_constraints=None, learning_rate=None, max_bin=None,
max_cat_threshold=None, max_cat_to_onehot=None,
max_delta_step=None, max_depth=5, max_leaves=None,
min_child_weight=None, missing=nan, monotone_constraints=None,
multi_strategy=None, n_estimators=270, n_jobs=None,
num_parallel_tree=None, random_state=None, ...) : 0.8120729528014297
Accuracy scores and KFold CV scores led me to choose XGB.